#### "Reanalyzing the Link between Democracy and Economic Development" ####
# journal: International Area Studies Review
# authors: "Pelke, Lars"
# date: 2023-07-24
# written under "R version 4.1.2 (2021-11-01)"

#### Preliminaries ####

R.version$version.string

# clear workspace
rm(list=ls())

# set your working directory 
# setwd()
 
# loading packages

library(countrycode)
library(tidyverse)
library(viridis)
library(ggpubr)
library(texreg)
library(sjPlot)
library(ggeffects)
library(readstata13)
#library(vdemdata)
library(imputeTS)
library(stargazer)
library(haven)


#### Import Data ####

# Load VDem data
vdem <- readRDS("data/vdem_10/Country_Year_V-Dem_Full+others_R_v10/V-Dem-CY-Full+Others-v10.rds") 

#### Regime duration ####

vdem <- vdem %>%
  mutate(autocracy= ifelse(v2x_regime<2, 1, 0), 
         autocracy= ifelse(is.na(v2x_regime), NA, autocracy))

cumsum.na <- function(x) {
  x[which(is.na(x))] <- 0
  return(cumsum(x))
}

vdem_auto <- vdem %>%
  group_by(country_id) %>% 
  mutate(democracy = ifelse(autocracy==1, 0, 1),
         democracy = ifelse(is.na(v2x_regime), NA, democracy)) %>% # get binary democracy measure for constructing years under democracy
  mutate(autocracy_years = ifelse(autocracy==1, cumsum.na(autocracy), NA)) %>%  # years under autocracy
  mutate(democracy_years = ifelse(democracy==1, cumsum.na(democracy), NA)) %>% # years under democracy
  ungroup() %>%
  dplyr::select(country_id, year, autocracy_years, democracy_years) 

vdem_auto$autocracy_years_na <- replace(vdem_auto$autocracy_years, is.na(vdem_auto$autocracy_years), 0)
vdem_auto$democracy_years_na <- replace(vdem_auto$democracy_years, is.na(vdem_auto$democracy_years), 0)

vdem_auto <- vdem_auto %>%
  mutate(regime_duration = democracy_years_na, 
         regime_duration = regime_duration + autocracy_years_na)

vdem_auto <-vdem_auto %>%
  dplyr::select(country_id, year, autocracy_years, democracy_years, regime_duration) %>%
  mutate(regime_duration = ifelse(year<1900, NA, regime_duration))

vdem <- vdem %>%
  left_join(vdem_auto, by = c("country_id", "year"))

vdem <- vdem %>%
  distinct(country_name, year, .keep_all = TRUE)

vdem$cowcode[vdem$country_name == "Hong Kong"] <- 715
vdem$cowcode[vdem$country_name == "Palestine/West Bank"] <- 1020

#### Episodes of Regimes Dataset ####

devtools::install_github("vdeminstitute/ERT")
library(ERT)

ert_data <- ERT::get_eps(
  data = vdemdata::vdem,
  start_incl = 0.05,
  cum_incl = 0.1,
  year_turn = 0.03,
  cum_turn = 0.1,
  tolerance = 5
)
table(ert_data$aut_ep)
table(ert_data$dem_ep)

ert_data <- ert_data %>%
  group_by(dem_ep_id) %>%
  mutate(last_v2x_polyarchy = last(v2x_polyarchy), 
         first_v2x_polyarchy = first(v2x_polyarchy)) %>%
  ungroup() %>%
  mutate(v2x_polyarchy_increase = last_v2x_polyarchy - first_v2x_polyarchy)

ert_data <- ert_data %>%
  group_by(aut_ep_id) %>%
  mutate(last_v2x_polyarchy = last(v2x_polyarchy), 
         first_v2x_polyarchy = first(v2x_polyarchy)) %>%
  ungroup() %>%
  mutate(v2x_polyarchy_decrease = last_v2x_polyarchy - first_v2x_polyarchy) %>%
  dplyr::select(-c(last_v2x_polyarchy, first_v2x_polyarchy))

ert_data <- ert_data %>%
  mutate(v2x_polyarchy_increase = ifelse(is.na(dem_ep_id), NA,v2x_polyarchy_increase )) %>%
  mutate(v2x_polyarchy_decrease = ifelse(is.na(aut_ep_id), NA,v2x_polyarchy_decrease )) %>%
  dplyr::select(-c(country_name, country_text_id, v2x_polyarchy,v2x_polyarchy_codelow, 
                   v2x_polyarchy_codehigh, v2x_regime))

vdem <- vdem %>%
  left_join(ert_data, by = c("country_id", "year"))

#### Data on Economic Development ####

## Data rom Fariss et al. (2021) posterior distributions

df_gdp <- readRDS("data/jcr2020/jcr2020/data/estimates_gdp_model_combined_normal_noslope_gamma_lambda_additive_test_20210827.rds")
df_gdppc <- readRDS("data/jcr2020/jcr2020/data/estimates_gdppc_model_combined_normal_noslope_gamma_lambda_additive_test_20210827.rds")
df_pop <- readRDS("data/jcr2020/jcr2020/data/estimates_pop_model_combined_normal_noslope_gamma_lambda_additive_test_20210827.rds")

dat_long_base <- bind_rows(df_pop,
                           df_gdp,
                           df_gdppc)

#### Preprocessing
df_names <- dat_long_base %>%
  dplyr::select(indicator, group) %>%
  unique() %>%
  mutate(fullname = case_when(
    indicator == "deng_pop" ~ "Population Deng (2004)",
    indicator == "BroadberryKlein_pop" ~ "Population Broadberry and Klein (2012)",
    indicator == "CINC_tpop" ~ "Population CINC 5.0",
    indicator == "Maddison2020_pop" ~ "Population Maddison Project (2020)",
    indicator == "MW_pop" ~ "Population Measuring Worth (2019)",
    indicator == "PWT100_pop" ~ "Population Penn World Tables 10.0",
    indicator == "WorldBank_pop" ~ "Population World Bank (2020)",
    
    # GDPPC
    indicator == "Bairoch_gdppc_ppp_bc_1960" ~ "GNP p.c. (PPP, \u03B3) Bairoch (1976)",
    indicator == "Broadberry_gdppc_ppp_bc_1990" ~ "GDP p.c. (PPP, \u03B3) Broadberry (2015)",
    indicator == "BroadberryKlein_gdppc_ppp_bc_1990" ~ "GDP p.c. (PPP, \u03B3) Broadberry and Klein (2012)",
    indicator == "Maddison2018_gdppc_ppp_bc_2011" ~ "GDP p.c. (PPP, \u03B3) Maddison Project (2018)",
    indicator == "MW_gdppc_con_bc_2010" ~ "GDP p.c. (exchange, \u03B3) Measuring Worth (2019)",
    indicator == "WorldBank_gdppc_con_bc_2010" ~ "GDP p.c. (exchange, \u03B3) World Bank (2020)",
    indicator == "WorldBank_gdppc_ppp_bc_2017" ~ "GDP p.c. (PPP, \u03B3) World Bank (2020)",
    indicator == "Maddison2018_gdppc_ppp_bt_2011" ~ "GDP p.c. (PPP, \u03BB) Maddison Project (2018)",
    indicator == "Maddison2020_gdppc_ppp_bcbt" ~ "GDP p.c. (PPP) Maddison Project (2020)",
    
    # GDP
    indicator == "Bairoch_gdp_ppp_bc_1960" ~ "GNP (PPP, \u03B3) Bairoch (1976)",
    indicator == "BroadberryKlein_gdp_ppp_bc_1990" ~ "GDP (PPP, \u03B3) Broadberry and Klein (2012)",
    indicator == "MW_gdp_con_bc_2010" ~ "GDP (exchange, \u03B3) Measuring Worth (2019)",
    indicator == "PWT100_gdp_ppp_bc_2017" ~ "GDP (PPP, \u03B3) Penn World Tables 10.0",
    indicator == "WorldBank_gdp_con_bc_2010" ~ "GDP (exchange, \u03B3) World Bank (2020)",
    indicator == "WorldBank_gdp_ppp_bc_2017" ~ "GDP (PPP, \u03B3) World Bank (2020)",
    indicator == "PWT100_gdp_ppp_bt_2017" ~ "GDP (PPP, \u03BB) Penn World Tables 10.0",
    indicator == "PWT100_gdp_ppp_none_2017" ~ "GDP (PPP) Penn World Tables 10.0",
    indicator == "latent_pop" ~ "Latent population",
    indicator == "latent_gdppc" ~ "Latent GDP per capita",
    indicator == "latent_gdp" ~ "Latent GDP") %>%
      factor(levels = c('Latent population',
                        'Population Penn World Tables 10.0',
                        'Population Maddison Project (2020)',
                        'Population World Bank (2020)',
                        'Population Broadberry and Klein (2012)',
                        'Population CINC 5.0',
                        'Population Measuring Worth (2019)',
                        'Population Deng (2004)',
                        
                        # GDPPC
                        'Latent GDP per capita',
                        'GDP p.c. (PPP, \u03B3) Maddison Project (2018)',
                        'GDP p.c. (PPP, \u03BB) Maddison Project (2018)',
                        'GDP p.c. (PPP) Maddison Project (2020)',
                        'GDP p.c. (PPP, \u03B3) World Bank (2020)',
                        'GDP p.c. (PPP, \u03B3) Broadberry and Klein (2012)',
                        'GNP p.c. (PPP, \u03B3) Bairoch (1976)',
                        'GDP p.c. (PPP, \u03B3) Broadberry (2015)',
                        'GDP p.c. (exchange, \u03B3) World Bank (2020)',
                        'GDP p.c. (exchange, \u03B3) Measuring Worth (2019)',
                        
                        # GDP
                        "Latent GDP",
                        'GDP (PPP, \u03B3) Penn World Tables 10.0',
                        'GDP (PPP, \u03BB) Penn World Tables 10.0',
                        'GDP (PPP) Penn World Tables 10.0',
                        'GDP (PPP, \u03B3) World Bank (2020)',
                        'GDP (PPP, \u03B3) Broadberry and Klein (2012)',
                        'GNP (PPP, \u03B3) Bairoch (1976)',
                        'GDP (exchange, \u03B3) World Bank (2020)',
                        'GDP (exchange, \u03B3) Measuring Worth (2019)'))) %>%
  mutate(ppp = ifelse(str_detect(indicator, "ppp"), "PPP", "Constant"),
         compare = case_when(
           str_detect(indicator, "bc\\_") ~ "bc",
           str_detect(indicator, "bt\\_") ~ "bt",
           str_detect(indicator, "none\\_") ~ "none",
           str_detect(indicator, "bcbt") ~ "none"
         )) %>%
  mutate(pppcompare = case_when(
    ppp == "Constant" & compare == "bc" ~ "Comparable across countries but not over time via exchange rates",
    ppp == "PPP" & compare == "bc" ~ "Comparable over time but not across countries via PPP",
    ppp == "PPP" & compare == "bt" ~  "Comparable across countries but not over time via PPP",
    ppp == "PPP" & compare == "none" ~  "Comparable across countries and over time via PPP",
    T ~ ""
  )) %>%
  arrange(group, ppp, compare, indicator) %>%
  mutate(shortname = case_when(
    indicator == "deng_pop" ~ "Deng (2004)",
    indicator == "BroadberryKlein_pop" ~ "Broadberry and Klein (2012)",
    indicator == "CINC_tpop" ~ "CINC 5.0",
    indicator == "Maddison2020_pop" ~ "Maddison Project (2020)",
    indicator == "MW_pop" ~ "Measuring Worth (2019)",
    indicator == "PWT100_pop" ~ "Penn World Tables 10.0",
    indicator == "WorldBank_pop" ~ "World Bank (2020)",
    indicator == "Bairoch_gdppc_ppp_bc_1960" ~ "Bairoch (1976)",
    indicator == "Broadberry_gdppc_ppp_bc_1990" ~ "Broadberry (2015)",
    indicator == "BroadberryKlein_gdppc_ppp_bc_1990" ~ "Broadberry and Klein (2012)",
    indicator == "Maddison2018_gdppc_ppp_bc_2011" ~ "Maddison Project (2018)",
    indicator == "Maddison2020_gdppc_ppp_bcbt" ~ "Maddison Project (2020)",
    indicator == "MW_gdppc_con_bc_2010" ~ "Measuring Worth (2019)",
    indicator == "WorldBank_gdppc_con_bc_2010" ~ "World Bank (2020)",
    indicator == "WorldBank_gdppc_ppp_bc_2017" ~ "World Bank (2020)",
    indicator == "Maddison2018_gdppc_ppp_bt_2011" ~ "Maddison Project (2018)",
    indicator == "Bairoch_gdp_ppp_bc_1960" ~ "Bairoch (1976)",
    indicator == "BroadberryKlein_gdp_ppp_bc_1990" ~ "Broadberry and Klein (2012)",
    indicator == "MW_gdp_con_bc_2010" ~ "Measuring Worth (2019)",
    indicator == "PWT100_gdp_ppp_bc_2017" ~ "Penn World Tables 10.0",
    indicator == "WorldBank_gdp_con_bc_2010" ~ "World Bank (2020)",
    indicator == "WorldBank_gdp_ppp_bc_2017" ~ "World Bank (2020)",
    indicator == "PWT100_gdp_ppp_bt_2017" ~ "Penn World Tables 10.0",
    indicator == "PWT100_gdp_ppp_none_2017" ~ "Penn World Tables 10.0",
    indicator == "latent_pop" ~ "Latent population",
    indicator == "latent_gdppc" ~ "Latent GDP per capita",
    indicator == "latent_gdp" ~ "Latent GDP"
  )) %>%
  mutate(groupname = case_when(
    group == "pop" ~ "Population",
    group == "gdp" ~ "GDP",
    group == "gdppc" ~ "GDP per capita"
  ))

df_long <- left_join(dat_long_base, df_names)

df_long_latent <- df_long %>%
  filter(indicator %in% c("Maddison2020_gdppc_ppp_bcbt", "Maddison2020_pop"))

df_long_latent_pop <- df_long_latent %>%
  filter(indicator == "Maddison2020_pop") %>%
  dplyr::select(-c(indicator, og, og_log, og_log10, ppp, compare, pppcompare, groupname, shortname, group, fullname)) %>%
  rename(Maddison2020_pop_mean_log = mean_log, 
         Maddison2020_pop_mean = mean, 
         Maddison2020_pop_mean_log10 = mean_log10, 
         Maddison2020_pop_sd_log = sd_log, 
         Maddison2020_pop_sd= sd, 
         Maddison2020_pop_sd_log10 = sd_log10)

df_long_latent_gdppc <- df_long_latent %>%
  filter(indicator == "Maddison2020_gdppc_ppp_bcbt") %>%
  dplyr::select(-c(indicator, og, og_log, og_log10, ppp, compare, pppcompare, groupname, shortname, group, fullname)) %>%
  rename(Maddison2020_gdppc_ppp_bcbt_mean_log = mean_log, 
         Maddison2020_gdppc_ppp_bcbt_mean = mean, 
         Maddison2020_gdppc_ppp_bcbt_mean_log10 = mean_log10, 
         Maddison2020_gdppc_ppp_bcbt_sd_log= sd_log, 
         Maddison2020_gdppc_ppp_bcbt_sd = sd, 
         Maddison2020_gdppc_ppp_bcbt_sd_log10 = sd_log10)

df_wide_latent_estimates <- df_long_latent_gdppc %>%
  left_join(df_long_latent_pop, by = c("gwno", "year"))

rm(df_long, dat_long_base, df_gdp, df_gdppc, df_long_latent, df_pop, df_names,  
   df_long_latent_gdppc, df_long_latent_pop)

df_wide_latent_estimates$gwno <- as.numeric(df_wide_latent_estimates$gwno)

df_wide_latent_estimates$country_id <- countrycode(df_wide_latent_estimates$gwno, origin = "gwn",  destination = "vdem", warn = TRUE )

df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 255] <- 77 #Germany- Prussia
df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 300] <- 144 # Austria-Hungary
df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 215] <- 157 # Czechoslovakia
df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 338] <- 178 # Malta
df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 345] <- 198 # Yugoslavia
df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 403] <- 196 # Sao Tome and Principe
df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 591] <- 199 # Seychelles
df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 678] <- 14 # Yemen
df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 730] <- 42 # Korea
df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 815] <- 35 # Vietnam
df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 816] <- 34 # Vietnam, Democratic Republic of
df_wide_latent_estimates$country_id[df_wide_latent_estimates$gwno == 817] <- 35 # Vietnam, Republic of

## Merge datasets ##

vdem <-  vdem %>%
  left_join(df_wide_latent_estimates, by = c("country_id", "year"))

vdem <- vdem %>% 
  distinct(country_id, year, .keep_all = TRUE)

#### Construct GDP growth per capita ####
# formula according to Un Statistics Devision

vdem <- vdem %>%
  group_by(country_id) %>%
  mutate(Maddison2020_gdppc_ppp_bcbt_growth_rate = ((Maddison2020_gdppc_ppp_bcbt_mean - dplyr::lag(Maddison2020_gdppc_ppp_bcbt_mean))/dplyr::lag(Maddison2020_gdppc_ppp_bcbt_mean))*100)
         
summary(vdem$Maddison2020_gdppc_ppp_bcbt_growth_rate)

#### Military Personnel Data ####

## Correlates of War National Material Capabilities Dataset ##

cow <- read.dta13("data/cow/NMC_5_0.dta")

summary(cow$milper)

cow <- cow %>%
  mutate(milper = ifelse(milper <0, NA, milper)) %>%
  rename(cown = ccode) %>%
  dplyr::select(cown, year, milper, stateabb)

summary(cow$milper)

vdem <- vdem %>%
  rename(cown = COWcode)

vdem <- vdem %>%
  left_join(cow, c("cown", "year"))

summary(vdem$milper)

mean.new <- function(v) {
  if (all(is.na(v))) { return(NA) } else { return(mean(v, na.rm=T)) }
}

vdem %>% filter(year==2012) %>%
  summarize(mean = mean.new(milper)) # last observation ins 2012 -> extrapolate military personell variable

vdem <- group_by(vdem, country_id)
vdem$milper_extrapol <- na_interpolation(vdem$milper, option = "spline")

vdem <- ungroup(vdem)

vdem <- vdem %>%
  mutate(milper_extrapol = ifelse(milper_extrapol<0,0, milper_extrapol))

vdem %>% filter(year==2019) %>%
  summarize(mean = mean.new(milper_extrapol)) # last observation ins 2012 -> extrapolate military personell variable

summary(vdem$milper_extrapol)

#### UCDP/PRIO Armed Conflict Dataset version 20.1  ####

load("data/ucdp/ucdp-prio-acd-201.RData")

ucdp_prio_acd_201 <- ucdp_prio_acd_201 %>%
  filter(type_of_conflict >2)

ucdp_prio_acd_201$cowcode <-countrycode(ucdp_prio_acd_201$location, "country.name", "cown", warn = TRUE) 

ucdp_prio_acd_201 <- ucdp_prio_acd_201 %>%
  mutate(civil_war_ucdp = 1) %>%
  dplyr::select(location, cowcode, year, civil_war_ucdp, intensity_level)

vdem <- vdem %>%
  left_join(ucdp_prio_acd_201, c("cowcode", "year"))

summary(vdem$civil_war_ucdp)

vdem$civil_war_ucdp <- replace(vdem$civil_war_ucdp, is.na(vdem$civil_war_ucdp), 0)

summary(vdem$civil_war_ucdp) # use data only after 1946!


#############################################################################################################
#############################################################################################################

#### Descriptive Statistics ####

mean_pop_latent <- vdem %>%
  group_by(year) %>%
  summarize(mean_gdppc_latent = mean.new(Maddison2020_pop_mean_log))

mean_gdppc <- vdem %>%
  group_by(year) %>%
  summarize(mean_gdppc_Mad = mean.new(Maddison2020_gdppc_ppp_bcbt_mean_log))


mean_growth <- vdem %>%
  group_by(year) %>%
  summarize(mean_growth = mean.new(Maddison2020_gdppc_ppp_bcbt_growth_rate))

vdem_usa_uk <- vdem %>%
  filter(country_name %in% c("United States of America", "United Kingdom")) %>%
  dplyr::select(country_name, year, Maddison2020_pop_mean_log, Maddison2020_gdppc_ppp_bcbt_mean_log, 
                Maddison2020_gdppc_ppp_bcbt_mean, Maddison2020_gdppc_ppp_bcbt_growth_rate, GDPgrowth)

#### Generating Binary Democracy Measures and Democratization Episodes ####

vdem <- vdem %>%
  mutate(dem = ifelse(v2x_regime >=2, 1, 
                      ifelse(v2x_regime <2, 0, NA)))
  
summary(vdem$dem)

vdem <- vdem %>%
  group_by(country_id) %>%
  mutate(dem_transition = dem - dplyr::lag(dem)) # 208 transition to democracy, 127 democratic breakdowns

vdem_list_trans_org <- vdem %>%
  filter(dem_transition %in% c(-1, 1)) %>%
  dplyr::select(country_name, year, dem, dem_transition)

table(vdem$dem_transition)  # 208 transition to democracy, 127 democratic breakdowns

## Manipulating Binary Democracy Measure (dem) in case of short-term 

vdem <- vdem %>%
  group_by(country_id) %>%
  mutate(dem = ifelse(dem==1 & dplyr::lead(dem==0) & dplyr::lag(dem==0), 0, dem))

vdem <- vdem %>%
  group_by(country_id) %>%
  mutate(dem_transition = dem - dplyr::lag(dem)) 

table(vdem$dem_transition)  # 185 transition to democracy, 104 democratic breakdowns, when short-term democratic transitions
# (1-year) were not accounted for

vdem_list_trans <- vdem %>%
  filter(dem_transition %in% c(-1, 1)) %>%
  dplyr::select(country_name, year, dem, dem_transition)

bangladesh <- vdem%>%
  filter(country_name == "Mexico") %>%
  dplyr::select(country_name, year, dem_bin_EDI, v2x_regime, v2x_polyarchy)


## Democratic transition for the full period ##

vdem <- vdem %>%
  mutate(dem_bin_EDI = ifelse(v2x_polyarchy>=0.5, 1, 0))

vdem <- vdem %>%
  group_by(country_id) %>%
  mutate(dem_transition = dem_bin_EDI - dplyr::lag(dem_bin_EDI)) 

table(vdem$dem_transition)  # 216 transition to democracy, 126 democratic breakdowns

vdem <- vdem %>%
  group_by(country_id) %>%
  mutate(dem_bin_EDI = ifelse(v2x_polyarchy>=0.5  & dplyr::lead(v2x_polyarchy<0.5) | dplyr::lead(n=2, v2x_polyarchy<0.5), 0, dem_bin_EDI)) 
# accounting for short term democratic transitions 


vdem <- vdem %>%
  group_by(country_id) %>%
  mutate(dem_transition = dem_bin_EDI - dplyr::lag(dem_bin_EDI)) 

table(vdem$dem_transition)  # 173 transition to democracy, 85 democratic breakdowns

vdem_list_trans <- vdem %>%
  filter(dem_transition %in% c(-1, 1)) %>%
  dplyr::select(country_name, year, dem_bin_EDI, dem_transition, v2x_polyarchy)


## Number Democratization and Autocratization Periods ##

vdem %>%
  count(dem_ep_id) %>%
  drop_na()# 348 democratization episodes in 158 countries

#### Select Variables ####

vdem_data <- vdem %>%
  dplyr::select(country_id, country_name, year, starts_with("dem_ep"), v2x_polyarchy, e_regiongeo, 
         e_migdpgro, e_migdpgrolns, e_migdppc, e_migdppcln, e_total_resources_income_pc, 
         starts_with("Maddison"),  
         milper, civil_war_ucdp, regime_duration, 
          e_mipopula, e_pefeliex, e_civil_war, e_miinteco, e_miinterc, 
         v2svstterr, v2stfisccap,  v2clrspct, year, v2x_regime, autocracy,  
          dem, dem_bin_EDI, v2x_libdem, v2x_api, v2x_mpi, v2x_liberal, v2x_partip, e_regionpol_6C, 
         e_total_resources_income_pc)

#### Table 1 ####

cols1 <- c("dem", "dem_bin_EDI", "dem_ep", "Maddison2020_gdppc_ppp_bcbt_mean_log",
           "Maddison2020_pop_mean_log", "Maddison2020_gdppc_ppp_bcbt_growth_rate", 
           "v2x_polyarchy", "v2svstterr", "v2clrspct", "milper", "regime_duration") 

vdem <- distinct(vdem, country_id, year, .keep_all= TRUE) # Distinct observations

stargazer(
  title="Summary Statistics for Dataset", 
  as.data.frame(vdem[, cols1]), type = "latex", 
  summary.stat = c("n", "mean", "median", "min", "max", "sd"))


vdem_data <- distinct(vdem_data, country_id, year, .keep_all= TRUE) # Distinct observations


#### 5 year average growth prior to democratization ####

vdem_data <- vdem_data %>%
  group_by(country_id) %>%
  mutate(Maddison2020_gdppc_ppp_bcbt_growth_rate_rolling_5_average = (lag(n = 1, Maddison2020_gdppc_ppp_bcbt_growth_rate) + lag(n = 2, Maddison2020_gdppc_ppp_bcbt_growth_rate) + lag(n = 3, Maddison2020_gdppc_ppp_bcbt_growth_rate) + lag(n = 4, Maddison2020_gdppc_ppp_bcbt_growth_rate)+  lag(n = 5, Maddison2020_gdppc_ppp_bcbt_growth_rate))/5)
summary(vdem_data$Maddison2020_gdppc_ppp_bcbt_growth_rate_rolling_5_average)

vdem_data <- vdem_data %>%
  rename(gdppc_ppp_bcbt_mean_log = Maddison2020_gdppc_ppp_bcbt_mean_log, 
         gdppc_ppp_bcbt_mean = Maddison2020_gdppc_ppp_bcbt_mean , 
         gdppc_ppp_bcbt_mean_log10 = Maddison2020_gdppc_ppp_bcbt_mean_log10, 
         gdppc_ppp_bcbt_sd_log = Maddison2020_gdppc_ppp_bcbt_sd_log, 
         gdppc_ppp_bcbt_sd_log10 = Maddison2020_gdppc_ppp_bcbt_sd_log10, 
         gdppc_ppp_bcbt_growth_rate = Maddison2020_gdppc_ppp_bcbt_growth_rate, 
         gdppc_ppp_bcbt_growth_rate_5avg = Maddison2020_gdppc_ppp_bcbt_growth_rate_rolling_5_average)

saveRDS(vdem_data, file = "data/vdem_data.rds")
write_dta(vdem_data, "data/vdem_data.dta", version = 14, label = attr(data, "label"))

#### Mean Number of Country Year per country ####

country_T_mean <- vdem_data %>%
  drop_na(v2x_regime, gdppc_ppp_bcbt_mean_log) %>%
  group_by(country_name) %>%
  summarize(number_of_years = n()) %>%
  summarize(mean_years = mean(number_of_years)) %>%
  as.numeric()


country_first_democracies <- vdem_data %>%
  group_by(country_id) %>%
  filter(dem_bin_EDI == 1) %>%
  summarize(country_name = first(country_name), 
            year = first(year)) %>%
  ungroup() %>%
  dplyr::select(-country_id) %>%
  arrange(year)

stargazer(
  title="Countries by democracy statis", 
  country_first_democracies, type = "latex", summary= FALSE)


#### Calculate number of democratic transitions and breakdowns ####

## Step 1: Binary Dem ##

vdem_list_trans <- vdem %>%
  group_by(country_id) %>%
  mutate(dem_transition = dem - lag(dem)) %>%
  filter(dem_transition %in% c(-1, 1)) %>%
  dplyr::select(country_name, year, dem, dem_transition)

table(vdem_list_trans$dem_transition)  # 185 transition to democracy, 104 democratic breakdowns

## Step 2: Binary Dem Data by EDI 

vdem_list_trans <- vdem %>%
  group_by(country_id) %>%
  mutate(dem_transition = dem_bin_EDI - lag(dem_bin_EDI)) %>%
  filter(dem_transition %in% c(-1, 1)) %>%
  dplyr::select(country_name, year, v2x_polyarchy, dem_bin_EDI, dem_transition)

table(vdem_list_trans$dem_transition)  # 109 transition to democracy, 49 democratic breakdowns

## Step 3: ERT data ##

vdem_list_trans_ert <-vdem %>%
  filter(!is.na(dem_ep_id)) %>%
  distinct(dem_ep_id, .keep_all = TRUE)

vdem_list_trans_ert_country <-vdem %>%
  filter(!is.na(dem_ep_id)) %>%
  distinct(dem_ep_id, .keep_all = TRUE) %>%
  distinct(country_name)

#### Prepare Data for IV Regression (Additional Robustness Tests) ####

sum.new <- function(v) {
  if (all(is.na(v))) { return(NA) } else { return(sum(v, na.rm=T)) }
}

## Construct a Instrument Variable, as reported in ANRR (2019) ##

vdem <- vdem %>%
  group_by(e_regionpol_6C, year) %>%
  mutate(number_countries_region = n(),
         number_democracy_per_region_dummy = sum.new(dem)) %>%
  ungroup() %>%
  mutate(number_democracy_per_region = ifelse(dem == 1, number_democracy_per_region_dummy-1, number_democracy_per_region_dummy), 
         demregion = number_democracy_per_region/number_countries_region) 
# jackknife number of democracies per region, see ANRR 2019, p. 82
summary(vdem$demregion)

#### Full Sample number of democracies ####

vdem <- vdem %>%
  group_by(e_regionpol_6C, year) %>%
  mutate(number_countries_region = n(),
         number_democracy_per_region_dummy = sum.new(dem_bin_EDI)) %>%
  ungroup() %>%
  mutate(number_democracy_per_region = ifelse(dem_bin_EDI == 1, number_democracy_per_region_dummy-1, number_democracy_per_region_dummy), 
         demregion_EDI = number_democracy_per_region/number_countries_region) 
# jackknife number of democracies per region, see ANRR 2019, p. 82
summary(vdem$demregion_EDI)


#### Generate yy year dummies ####

# create a function
make_dummies <- function(v, prefix = '') {
  s <- sort(unique(v))
  d <- outer(v, s, function(v, s) 1L * (v == s))
  colnames(d) <- paste0(prefix, s)
  d
}

# bind the dummies to the original dataframe
dummy_years <- make_dummies(vdem$year, prefix = 'yy')

vdem <- cbind(vdem, as_tibble(dummy_years))

vdem_data <- vdem %>%
  dplyr::select(country_id, country_name, year, starts_with("dem_ep"), v2x_polyarchy, e_regiongeo, 
                e_migdpgro, e_migdpgrolns, e_migdppc, e_migdppcln, e_total_resources_income_pc, 
                starts_with("Maddison"),  
                milper, civil_war_ucdp, regime_duration, 
                e_mipopula, e_pefeliex, e_civil_war, e_miinteco, e_miinterc, 
                v2svstterr, v2stfisccap,  v2clrspct, year, v2x_regime, autocracy,  
                dem, dem_bin_EDI, demregion, demregion_EDI, starts_with("yy"))


vdem_data <- vdem_data %>%
  group_by(country_id) %>%
  mutate(Maddison2020_gdppc_ppp_bcbt_growth_rate_rolling_5_average = (lag(n = 1, Maddison2020_gdppc_ppp_bcbt_growth_rate) + lag(n = 2, Maddison2020_gdppc_ppp_bcbt_growth_rate) + lag(n = 3, Maddison2020_gdppc_ppp_bcbt_growth_rate) + lag(n = 4, Maddison2020_gdppc_ppp_bcbt_growth_rate)+  lag(n = 5, Maddison2020_gdppc_ppp_bcbt_growth_rate))/5)
summary(vdem_data$Maddison2020_gdppc_ppp_bcbt_growth_rate_rolling_5_average)

vdem_data <- vdem_data %>%
  rename(gdppc_ppp_bcbt_mean_log = Maddison2020_gdppc_ppp_bcbt_mean_log, 
         gdppc_ppp_bcbt_mean = Maddison2020_gdppc_ppp_bcbt_mean , 
         gdppc_ppp_bcbt_mean_log10 = Maddison2020_gdppc_ppp_bcbt_mean_log10, 
         gdppc_ppp_bcbt_sd_log = Maddison2020_gdppc_ppp_bcbt_sd_log, 
         gdppc_ppp_bcbt_sd_log10 = Maddison2020_gdppc_ppp_bcbt_sd_log10, 
         gdppc_ppp_bcbt_growth_rate = Maddison2020_gdppc_ppp_bcbt_growth_rate, 
         gdppc_ppp_bcbt_growth_rate_5avg = Maddison2020_gdppc_ppp_bcbt_growth_rate_rolling_5_average)

saveRDS(vdem_data, file = "data/vdem_data_IV_regression.rds")
write_dta(vdem_data, "data/vdem_data_IV_regression.dta", version = 14, label = attr(data, "label"))


